# Computations
import numpy as np
import pandas as pd
from scipy.stats import norm
# preprocessing
from sklearn.preprocessing import StandardScaler
# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
# Pytorch
import torch
from torch.autograd import Variable
import torch.nn as nn
import torchvision.transforms as transforms
# Visualisation libraries
## Progress Bar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we analyze and predict customer churn for Telco Customer Churn data.
| Columns | Description |
|---|---|
| customerID | Customer ID |
| gender | Whether the customer is a male or a female |
| SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) |
| Partner | Whether the customer has a partner or not (Yes, No) |
| Dependents | Whether the customer has dependents or not (Yes, No) |
| tenure | Number of months the customer has stayed with the company |
| PhoneService | Whether the customer has a phone service or not (Yes, No) |
| MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| InternetService | Customer’s internet service provider (DSL, Fiber optic, No) |
| OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) |
| OnlineBackup | Whether the customer has an online backup or not (Yes, No, No internet service) |
| DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) |
| TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) |
| StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) |
| StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) |
| Contract | The contract term of the customer (Month-to-month, One year, Two years) |
| PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) |
| PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)) |
| MonthlyCharges | The amount charged to the customer monthly |
| TotalCharges | The total amount charged to the customer |
| Churn | Whether the customer churned or not (Yes or No) |
# Path of the raw Telco churn CSV. The file actually read is its sibling with a
# '_STD' suffix (presumably the standardized/encoded version - confirm upstream).
Path = 'telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
Data = pd.read_csv(Path.split(".")[0]+'_STD.csv')
# df is the modelling frame: everything in Data except the identifier column.
df = Data.drop(columns = ['customer ID'])
Target = 'Churn'
# Display names for the two target classes, indexed by class code (0 -> 'No', 1 -> 'Yes').
Labels = ['No', 'Yes']
# Styler.hide_index() and Styler.set_precision() were removed in pandas 2.0.
# Use the replacement API when it exists and fall back on older pandas.
Preview = Data.head(6).style
if hasattr(Preview, 'hide'):
    Preview = Preview.hide(axis = 'index').format(precision = 2)
else:
    Preview = Preview.hide_index().set_precision(2)
display(Preview)
del Preview
| customer ID | Gender | Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Internet Service | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Contract | Paperless Billing | Monthly Charges | Total Charges | Churn | Bank transfer (automatic) | Credit card (automatic) | Electronic check | Mailed check |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7590-VHVEG | -1.01 | -0.44 | 1.03 | -0.65 | -1.28 | -3.05 | -0.85 | -0.29 | -0.10 | 1.18 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | -1.16 | -0.99 | 0 | -0.53 | -0.53 | 1.41 | -0.54 |
| 5575-GNVDE | 0.99 | -0.44 | -0.97 | -0.65 | 0.07 | 0.33 | -0.85 | -0.29 | 1.32 | -0.17 | 1.18 | -0.10 | -0.22 | -0.23 | 0.37 | -1.21 | -0.26 | -0.17 | 0 | -0.53 | -0.53 | -0.71 | 1.84 |
| 3668-QPYBK | 0.99 | -0.44 | -0.97 | -0.65 | -1.24 | 0.33 | -0.85 | -0.29 | 1.32 | 1.18 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | -0.36 | -0.96 | 1 | -0.53 | -0.53 | -0.71 | 1.84 |
| 7795-CFOCW | 0.99 | -0.44 | -0.97 | -0.65 | 0.51 | -3.05 | -0.85 | -0.29 | 1.32 | -0.17 | 1.18 | 1.31 | -0.22 | -0.23 | 0.37 | -1.21 | -0.75 | -0.20 | 0 | 1.89 | -0.53 | -0.71 | -0.54 |
| 9237-HQITU | -1.01 | -0.44 | -0.97 | -0.65 | -1.24 | 0.33 | -0.85 | 1.00 | -0.10 | -0.17 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | 0.20 | -0.94 | 1 | -0.53 | -0.53 | 1.41 | -0.54 |
| 9305-CDSKC | -1.01 | -0.44 | -0.97 | -0.65 | -0.99 | 0.33 | 1.17 | 1.00 | -0.10 | -0.17 | 1.18 | -0.10 | 1.10 | 1.09 | -0.83 | 0.83 | 1.16 | -0.65 | 1 | -0.53 | -0.53 | 1.41 | -0.54 |
Here we use StratifiedShuffleSplit, a variation of ShuffleSplit that returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set.
# Feature matrix and target vector as plain NumPy arrays
X = df.drop(columns = Target).values
y = df[Target].values
Test_Size = 0.3
# One stratified shuffle split with a fixed seed, so the partition is reproducible
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
# get_n_splits only reports the configured number of splits; the value is discarded
_ = sss.get_n_splits(X, y)
# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays
train_index, test_index = next(sss.split(X, y))
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
del sss
def Train_Test_Plot(y_train = y_train, y_test = y_test, Colors = ['FireBrick', 'SeaGreen']):
    """Show the class balance of the train and test targets as two donut charts.

    Parameters
    ----------
    y_train, y_test : array-like
        Target vectors. The defaults are the module-level arrays as they
        existed when this function was defined.
    Colors : list of str
        Slice colours, passed to every pie trace.

    The function displays a Plotly figure and returns nothing.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}]*2])
    # Both panels are built the same way; only the data, name and column differ.
    for col, (name, values) in enumerate([('Train Set', y_train), ('Test Set', y_test)], start=1):
        # np.unique returns the classes sorted, so the counts follow Labels
        # (assuming the classes are encoded 0/1 in that order).
        _, counts = np.unique(values, return_counts=True)
        fig.add_trace(go.Pie(labels=Labels,
                             values=counts,
                             pull=[0, 0.1],
                             name=name,
                             textfont=dict(size=16),
                             marker=dict(colors=Colors, line=dict(color='black', width=1))), 1, col)
    fig.update_traces(hole=.5)
    # The bold tags are now properly closed (the original opened <b> twice).
    fig.update_layout(height = 400, legend=dict(orientation="v"),
                      legend_title_text= Target,
                      annotations=[dict(text= '<b>' + 'Train<br>Set' + '</b>', x=0.195, y=0.5, font_size=14, showarrow=False),
                                   dict(text= '<b>' + 'Test<br>Set' + '</b>', x=0.8, y=0.5, font_size=14, showarrow=False)],
                      title={'text': '<b>' + Target + '</b>', 'x':0.48, 'y': .83, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()

Train_Test_Plot(Colors = ['SeaGreen', 'Tomato'])
A multi-layer perceptron (MLP) is a class of feedforward artificial neural network (ANN). At each iteration, the algorithm uses the Cross-Entropy Loss to measure the error, computes the gradients, and then updates the model parameters. At the end of this iterative process, we expect a better level of agreement between the true and predicted labels, since the error should be lower than that of the first step.
def TorchSets(Set):
    """Convert a NumPy array into a torch tensor, on the GPU when one is available.

    One-dimensional input is cast to ``torch.LongTensor`` (64-bit integers);
    input of any other rank keeps the dtype NumPy gave it.
    """
    tensor = torch.from_numpy(Set)
    if Set.ndim == 1:
        tensor = tensor.type(torch.LongTensor)
    # Move to CUDA only after the optional cast, mirroring the CPU path
    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return Variable(tensor)
# Tensors
X_train_tensor = TorchSets(X_train)
y_train_tensor = TorchSets(y_train)
X_test_tensor = TorchSets(X_test)
y_test_tensor = TorchSets(y_test)
Batch_size = 100
iteration_number = int(2e4)
# Number of passes over the training data needed to reach roughly
# iteration_number mini-batch updates at this batch size
epochs_number = int(iteration_number / (len(X_train) / Batch_size))
# Pytorch train and test sets
Train_set = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
Test_set = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
# data loader
train_loader = torch.utils.data.DataLoader(Train_set, batch_size = Batch_size, shuffle = False)
# Bug fix: the test loader must wrap Test_set. It previously wrapped Train_set,
# so any accuracy computed from it was measured on the training data.
test_loader = torch.utils.data.DataLoader(Test_set, batch_size = Batch_size, shuffle = False)
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print ``Text`` as a coloured banner followed by a rule of '=' characters.

    ``C`` names the background colour of the text and the colour of the rule,
    ``T`` names the text colour, and ``L`` is the total width: the rule is
    ``L - len(Text) - 1`` characters long.
    """
    base_names = ('Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan')
    # Background palette has no 'White' entry; the foreground palette does
    BACK = {name: getattr(Back, name.upper()) for name in base_names}
    FORE = {name: getattr(Fore, name.upper()) for name in base_names + ('White',)}
    banner = BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = FORE[C] + Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of ``L`` '=' characters in the colour named by ``C``."""
    colour_names = ('Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan', 'White')
    palette = {name: getattr(Fore, name.upper()) for name in colour_names}
    colour = palette[C]
    print(colour + Style.NORMAL + L*'=' + Style.RESET_ALL)
def Plot_history(history, Table_Rows = 25, yLim = 2):
    """Plot the training history as loss/accuracy curves next to a summary table.

    Parameters
    ----------
    history : pandas.DataFrame
        Must provide the columns 'Iteration', 'Loss' and 'Accuracy'.
    Table_Rows : int
        Number of evenly spaced rows sampled for the table; the last row of
        ``history`` is always added.
    yLim : number
        Upper bound of the y-axis of the curve panel (the lower bound is 0).

    The function displays a Plotly figure and returns nothing.
    """
    # Grid/frame styling shared by both axes of the left panel
    axis_style = dict(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                      showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "scatter"},{"type": "table"}]])

    # Left panel: loss and accuracy against the iteration counter
    fig.add_trace(go.Scatter(x= history['Iteration'].values, y= history['Loss'].astype(float).values.round(4),
                             line=dict(color='OrangeRed', width= 1.5), name = 'Loss'), 1, 1)
    fig.add_trace(go.Scatter(x= history['Iteration'].values, y= history['Accuracy'].astype(float).values,
                             line=dict(color='MidnightBlue', width= 1.5), name = 'Accuracy'), 1, 1)
    fig.update_layout(legend=dict(x=0, y=1.1, traceorder='reversed', font_size=12),
                      dragmode='select', plot_bgcolor= 'white', height=600, hovermode='closest',
                      legend_orientation='h')
    fig.update_xaxes(range=[history.Iteration.min(), history.Iteration.max()], **axis_style)
    fig.update_yaxes(range=[0, yLim], **axis_style)

    # Right panel: evenly spaced index labels plus the final one
    picked = np.linspace(0, history.shape[0], Table_Rows, endpoint = False).round(0).astype(int)
    picked = np.append(picked, history.index[-1])
    subset = history[history.index.isin(picked)]
    formatted = subset.copy()
    # Scientific notation with four decimals for the two metric columns
    formatted[['Loss','Accuracy']] = formatted[['Loss','Accuracy']].applymap(lambda x: '%.4e' % x)
    column_values = [formatted.loc[:, name].values for name in formatted.columns]
    fig.add_trace(go.Table(header=dict(values = list(subset.columns), line_color='darkslategray',
                                       fill_color='Navy', align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = [0.4, 0.4, 0.4],
                           cells=dict(values=column_values, line_color='darkslategray',
                                      fill=dict(color=['Lavender', 'white', 'white']),
                                      align=['center', 'center'], font_size=12,height=20)), 1, 2)
    fig.show()
# Model
class MLP_Model(nn.Module):
    """Feed-forward network with four equally wide hidden layers and a linear readout.

    Layer stack, in order:
    Linear -> ReLU -> Linear -> Tanh -> Linear -> ELU -> Linear -> ELU -> Linear.
    The last layer returns raw scores; no softmax is applied here.
    """

    def __init__(self, input_Size, hidden_Size, output_Size):
        super().__init__()
        # Sub-modules are registered in forward order (attribute names are
        # part of the state_dict keys, so they are kept as they were)
        self.fc1 = nn.Linear(input_Size, hidden_Size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_Size, hidden_Size)
        self.tanh2 = nn.Tanh()
        self.fc3 = nn.Linear(hidden_Size, hidden_Size)
        self.elu3 = nn.ELU()
        self.fc4 = nn.Linear(hidden_Size, hidden_Size)
        self.elu4 = nn.ELU()
        # Readout layer
        self.fc5 = nn.Linear(hidden_Size, output_Size)

    def forward(self, x):
        """Return the readout scores for a batch ``x``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.tanh2(self.fc2(hidden))
        hidden = self.elu3(self.fc3(hidden))
        hidden = self.elu4(self.fc4(hidden))
        return self.fc5(hidden)
# Network dimensions: one input per feature column, one output score per distinct class
input_Size = len(X[0])
output_Size = len(np.unique(y))
hidden_Size = 64

# Build the network and move it to the GPU when one is present
model = MLP_Model(input_Size, hidden_Size, output_Size)
if torch.cuda.is_available():
    model = model.cuda()

# Loss: cross entropy over the raw class scores
CEL = nn.CrossEntropyLoss()

# Optimizer: plain stochastic gradient descent
learning_rate = 1e-2
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Training the Model
Count = 0
Loss_list = []
Iteration_list = []
Accuracy_list = []
# NOTE(review): MSE_list and MAE_list are never filled in this section; they are
# kept only in case a later cell expects the names to exist.
MSE_list = []
MAE_list = []
# Evaluate and record the metrics every `Steps` optimizer updates
Steps = 10
n_features = X[0].shape[0]
# The bar maximum is the exact number of optimizer steps that will run
# (epochs x batches per epoch) instead of an estimate padded by a constant.
Progress_Bar = progressbar.ProgressBar(maxval= epochs_number * len(train_loader),
                                       widgets=[progressbar.Bar('=', '|', '|'),
                                                progressbar.Percentage()])
for epoch in range(epochs_number):
    for Xtr, ytr in train_loader:
        Xtr = Xtr.view(-1, n_features)
        # Set all gradients to zero
        optimizer.zero_grad()
        # Forward
        Out = model(Xtr.float())
        # loss
        loss = CEL(Out, ytr.long())
        # Backward (Calculating the gradients)
        loss.backward()
        # Update parameters
        optimizer.step()
        Count += 1
        del Xtr, ytr
        # Periodic evaluation on test_loader
        if Count % Steps == 0:
            Correct, Total = 0, 0
            # No gradients are needed for scoring; skipping autograd saves
            # memory and time without changing the predictions.
            with torch.no_grad():
                for Xts, yts in test_loader:
                    Out = model(Xts.view(-1, n_features).float())
                    # Index of the largest score per row = predicted class
                    Predicted = torch.max(Out, 1)[1]
                    Total += len(yts)
                    Correct += (Predicted == yts).sum().item()
                    del Xts, yts
            # Store plain Python numbers so nothing keeps tensors alive
            Loss_list.append(loss.item())
            Iteration_list.append(Count)
            Accuracy_list.append(Correct / float(Total))
        Progress_Bar.update(Count)
Progress_Bar.finish()
history = pd.DataFrame({'Iteration': np.array(Iteration_list),
                        'Loss': np.array(Loss_list),
                        'Accuracy': np.array(Accuracy_list)})
del Loss_list, Iteration_list, Accuracy_list
|=========================================================================|100%
Model Performance
# Show the recorded loss/accuracy curves (y-axis limited to [0, 1]) with an 18-row summary table
Plot_history(history, Table_Rows = 18, yLim = 1)
The confusion matrix allows for visualization of the performance of an algorithm. Note that due to the size of data, here we don't provide a Cross-validation evaluation. In general, this type of evaluation is preferred.
def Confusion_Matrix(X_train_tensor = X_train_tensor, X_test_tensor = X_test_tensor, y_train = y_train, y_test = y_test):
    """Compute and draw the confusion matrices of ``model`` on the train and test sets.

    For each set, one figure is drawn with two heatmaps: raw counts on the
    left and the row-normalised matrix on the right.

    Returns
    -------
    (CM_Train, CM_Test)
        The two count matrices as returned by ``metrics.confusion_matrix``.
    """
    def _confusion(features, truth):
        # Predicted class = index of the largest score in each row
        scores = model(features.float())
        predicted = torch.max(scores.data, 1)[1].cpu().data.numpy()
        return metrics.confusion_matrix(truth, predicted)

    CM_Train = _confusion(X_train_tensor, y_train)
    CM_Test = _confusion(X_test_tensor, y_test)

    # Bold font for the figure titles
    font = FontProperties()
    font.set_weight('bold')

    for title, matrix in zip(['Train Set', 'Test Set'], [CM_Train, CM_Test]):
        fig, ax = plt.subplots(1, 2, figsize=(12, 4))
        fig.suptitle(title, fontproperties=font, fontsize = 16)
        counts_ax, share_ax = ax[0], ax[1]
        sns.heatmap(matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = counts_ax,
                    linewidths = 0.2, cbar_kws={"shrink": 1})
        counts_ax.set_title('Confusion Matrix')
        # Divide every row by its total so each row sums to one
        row_share = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
        sns.heatmap(row_share,
                    annot=True, annot_kws={"size": 14}, cmap="Greens", ax = share_ax,
                    linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
        share_ax.set_title('Normalized Confusion Matrix')
        for a in ax:
            a.set_xlabel('Predicted labels')
            a.set_ylabel('True labels')
            a.xaxis.set_ticklabels(Labels)
            a.yaxis.set_ticklabels(Labels)
            a.set_aspect(1)
    return CM_Train, CM_Test

CM_Train, CM_Test = Confusion_Matrix()
Some of the metrics that we use here to measure the accuracy: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_n & F_p\\ F_n & T_p\end{bmatrix}, \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}},\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ). \end{align} Accuracy can be a misleading metric for imbalanced data sets. In these cases, the balanced accuracy (bACC) [6] is recommended: it normalizes the true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
# Print the same five metrics for the train and the test confusion matrix.
# The train header uses Header's default colour; the test header is green.
for _title, _header_kwargs, _tag, _cm in [('Train Set', {}, 'Train', CM_Train),
                                          ('Test Set', {'C': 'Green'}, 'Test', CM_Test)]:
    Header(_title, **_header_kwargs)
    # Flattened matrix order used here: tn, fp, fn, tp
    tn, fp, fn, tp = _cm.ravel()
    Precision = tp/(tp+fp)
    Recall = tp/(tp + fn)
    TPR = tp/(tp +fn)
    TNR = tn/(tn +fp)
    BA = (TPR + TNR)/2
    # Share of samples predicted positive; computed but not printed
    PPCR = (tp + fp)/(tp + fp + tn+ fn)
    for _name, _value in [('Precision', Precision), ('Recall', Recall), ('TPR', TPR),
                          ('TNR', TNR), ('Balanced Accuracy', BA)]:
        print('%s (%s) = %.2f' % (_name, _tag, _value))
del tn, fp, fn, tp, Precision, Recall, TPR, TNR, BA
del _title, _header_kwargs, _tag, _cm, _name, _value
Line()
Train Set ========================================================================================== Precision (Train) = 0.68 Recall (Train) = 0.72 TPR (Train) = 0.72 TNR (Train) = 0.88 Balanced Accuracy (Train) = 0.80 Test Set =========================================================================================== Precision (Test) = 0.57 Recall (Test) = 0.61 TPR (Test) = 0.61 TNR (Test) = 0.83 Balanced Accuracy (Test) = 0.72 ====================================================================================================
Now for any given dataset, we can predict churn
# Draw a random 10% of the rows (no seed is set, so the selection differs between runs)
Sample = df.sample(frac = 0.1)
X_sample = Sample.drop(columns = [Target]).values
# Tensor on the GPU when available, otherwise on the CPU
X_sample_tensor = torch.from_numpy(X_sample)
if torch.cuda.is_available():
    X_sample_tensor = X_sample_tensor.cuda()
X_sample_tensor = Variable(X_sample_tensor)
# Class scores -> index of the largest score per row -> display label
y_pred = model(X_sample_tensor.float()).cpu().detach().numpy()
y_pred = pd.DataFrame({'Churn (Predicted)': [Labels[0] if code == 0 else Labels[1]
                                             for code in y_pred.argmax(axis=1)]})
# Known IDs and true labels, re-indexed from 0 so they line up with y_pred by position
Known = Data.loc[Sample.index, ['customer ID','Churn']].reset_index(drop = True)
Predictions = pd.concat([Known, y_pred], axis = 1)
Predictions['Churn'] = [Labels[0] if code == 0 else Labels[1] for code in Predictions['Churn']]
del Known
display(Predictions.head(15))
| | customer ID | Churn | Churn (Predicted) |
|---|---|---|---|
| 0 | 0475-RIJEP | Yes | No |
| 1 | 3566-CAAYU | No | No |
| 2 | 7743-EXURX | Yes | Yes |
| 3 | 9586-JGQKH | No | No |
| 4 | 1271-UODNO | No | No |
| 5 | 9309-BZGNT | No | No |
| 6 | 5624-BQSSA | Yes | No |
| 7 | 9369-XFEHK | Yes | No |
| 8 | 4116-TZAQJ | Yes | Yes |
| 9 | 3134-DSHVC | No | No |
| 10 | 2876-VBBBL | Yes | Yes |
| 11 | 8421-WZOOW | No | No |
| 12 | 9677-AVKED | No | No |
| 13 | 6994-FGRHH | No | No |
| 14 | 8838-GPHZP | No | No |
Although the model is doing pretty well considering the complexity of this problem, we can improve the results by designing an iterative optimization that utilizes the accuracy and recall scores.